##############################################################################
# File-Name: 04_analysis_attrition_balance.r
# Purpose: Main script to add tables for attrition analysis 
# Output:supplemental materials, table 4-8
# status: ongoing
# Machine: MacOS High Sierra
##############################################################################

# Packages ----------------------------------------------------------------
pacman::p_load("lme4", "marginaleffects", "sandwich", "ri2", "lmtest", "clubSandwich","tidyverse", "here", "janitor", "modelsummary")

# Open pre-treatment data -------------------------------------------------
temp <- read_rds("data/pre_treatment.rds")

# variable names for balance
vars_balance <- c("Age", 
                  "Gender", 
                  "Education", 
                  "Interest in Politics", 
                  "Ideology Self", 
                  "Polarization Candidates", 
                  "WP:Daily time", 
                  "WP:Frequency Politics", 
                  "WP:Frequency News", 
                  "WP:Frequency Family",
                  "WP:Frequency Images about Politics",
                  "N Active Social Media Apps", 
                  "Duration Pre-treatment")

# RI ----------------------------------------------------------------------
## Function to perform randomization inference
ri_fun <- function(data, dv, covariates_for_model, sims=1000) {
    
    declaration <- declare_ra(N = nrow(data))
    
    
    balance_fun = function(data, covariates=covariates_for_model){
        summary(lm(as.formula(paste0(dv, "~", "." )) , 
                   data = data))$f[1]
    }
    
    ri2_out <-
        conduct_ri(
            test_function = balance_fun,
            declaration = declaration,
            assignment = dv,
            sharp_hypothesis = 0,
            data = data, 
            sims=sims
        )
    
    return(ri2_out)  
}

# Table 4: Enrolled vs Dropout in Treatment Instructions ---------------------------------

# make a temporary data to perform randomization inference
temp_ = temp %>% mutate(invited_enrolled=as.numeric(fct_rev(as.factor(invited_enrolled))))
temp_ = temp %>% mutate(invited_started=as.numeric(fct_rev(as.factor(invited_started))))


# fstats
outputs = ri_fun(temp %>% select(vars_balance, invited_enrolled) %>% mutate(invited_enrolled=as.numeric(as.factor(invited_enrolled))), 
                 dv="invited_enrolled", covariates_for_model=vars_balance)


# table
datasummary_balance(~invited_enrolled,
                    data =  temp %>% select(vars_balance, invited_enrolled),
                    title = "",
                    notes = "", 
                    dinm_statistic = "p.value",
                    add_rows = tibble(a="Omnibus Test:", 
                                      b="F-Statistics = 6.91, p-values < 0.01", 
                                      c="", 
                                      d="", 
                                      e="", 
                                      f="", 
                                      g=""),
                    fmt=2,
                    output= "output/sm_tab4.tex")


# Table 5: Treatment vs control enrolled sample ---------------------------------

# f-test
outputs = ri_fun(temp %>% select(vars_balance, exp) %>% mutate(exp=as.numeric(as.factor(exp))), dv="exp", covariates=vars_balance)

datasummary_balance(~exp,
                    data =  temp %>%  filter(enrolled==1)%>% select(vars_balance, exp)  %>% mutate(exp=fct_rev(as.factor(exp))),
                    title = "",
                    notes = "", 
                    dinm_statistic = "p.value",
                    fmt=2, 
                    add_rows = tibble(a="Omnibus Test:", 
                                      b="F-Statistics = 0.56, p-values = 0.89", 
                                      c="", 
                                      d="", 
                                      e="", 
                                      f="", 
                                      g=""), 
                    output="output/sm_tab5.tex")



# Table 6 : Enrolled vs Dropout in Post-Treatment survey ---------------------------------
outputs = ri_fun(temp %>% select(vars_balance, invited_completed) %>% 
                     mutate(invited_completed=as.numeric(as.factor(invited_completed))), 
                 dv="invited_completed", covariates=vars_balance)


datasummary_balance(~invited_completed,
                    data =  temp %>% select(vars_balance, invited_completed),
                    title = "",
                    notes = "", 
                    dinm_statistic = "p.value",
                    add_rows = tibble(a="Omnibus Test:", 
                                      b="F-Statistics = 0.73, p-values = 0.72", 
                                      c="", 
                                      d="", 
                                      e="", 
                                      f="", 
                                      g=""),
                    fmt=2,  
                    output="output/sm_tab6.tex")


# Table 7: Treatment vs control enrolled sample ---------------------------------
outputs = ri_fun(temp %>% filter(completes==1) %>%
                     select(vars_balance, exp) %>% 
                     mutate(exp=as.numeric(as.factor(exp))), 
                 dv="exp", covariates=vars_balance)

datasummary_balance(~exp,
                    data =  temp %>%  filter(completes==1)%>% select(vars_balance, exp),
                    title = "",
                    notes = "", 
                    dinm_statistic = "p.value",
                    add_rows = tibble(a="Omnibus Test:", 
                                      b="F-Statistics = 0.45, p-values = 0.96", 
                                      c="", 
                                      d="", 
                                      e="", 
                                      f="", 
                                      g=""),
                    fmt=2, output="output/sm_tab7.tex")

# Ghanem et al Tests ------------------------------------------------------

# build treatment and control attriters
temp <- temp %>%
    mutate(gh=case_when(exp=='control' & invited_completed=="Completes" ~ "CR", 
                        exp=='treatment' & invited_completed=="Completes" ~ "TR",
                        exp=='control' & invited_completed=="Dropout in Post-Treatment" ~ "CA", 
                        exp=='treatment' & invited_completed=="Dropout in Post-Treatment" ~ "TA"
    ), 
    gh = fct_relevel(gh, c("TR", "CR", "TA", "CA")), 
    attrit = ifelse(invited_completed=="Completes", 1, 0)) %>% 
    drop_na(invited_completed) 



# differential attrition
diff_attrit <- lm(attrit ~ exp , data=temp)


## outcomes
dv_labels <- c("Interest in Politics", 
               "Trust elections", 
               "False News Exposure", 
               "Affective Polarization",
               "WP:Daily time")

dv <- temp %>% select(all_of(c("Interest in Politics", 
                                 "Trust elections", 
                                 "q_fake_news", 
                                 "affective_pol_diff_all",
                                 "WP:Daily time"))) %>%  clean_names() %>% colnames()

## clean names
temp <- temp %>% clean_names()

# function to calculate ghahem attrition bias
attrition_bias <- function(dv, dv_label, data){
    
    # model  
    model_linear = lm(as.formula(paste0(dv, "~ gh")), data=data)
    
    # parameters
    parameters <-  model_linear %>%
        avg_predictions(by="gh") %>%
        tidy() %>%
        select(gh, estimate) %>%
        pivot_wider(names_from=gh, 
                    values_from=estimate)
    
    # joint test
    p = car::linearHypothesis(model_linear, c("ghTA - ghCA =0", "ghCR=0"))[2, "Pr(>F)"]
    
    # output
    tibble(Outcome= dv_label,
           C="0.96", 
           Differential = "-0.024 (p-value = 0.125)") %>%
        bind_cols(parameters) %>%
        bind_cols(pvalue=p)
}


# Convert factor to numeric
temp$trust_elections <- as.numeric(temp$trust_elections)

# run test
res <- map2(dv, dv_labels, ~ attrition_bias(.x, .y, temp)) %>% bind_rows()

# round results
res = res %>% mutate_if(is.numeric, round, digits=2)

# latex table
library(kableExtra)

# Produces the table.
kable(
    res, 
    caption = "Sample Internal Validity Test for Primary Outcomes",
    format = "latex", booktabs = T, escape = F, linesep = "",
    row.names = F, label = "tab:ghanem") %>%
    kable_styling(full_width = F, latex_options = c("HOLD_position", "scale_down")) %>%
    add_header_above(c(" " = 1, "Attrition Bias" = 2, 
                       "Mean baseline outcome by group" = 4, "Test of internal validity" = 1))  %>%
    column_spec(1, width = "7cm") %>%
    footnote(general = "Column 1 reports the attrition rate for control, and Column 2 reports the differential attrition rate between treatment and control, with the corresponding p-value testing for difference in attrition between the groups (\textit{differential attrition}). Columns 3-6 present the mean baseline outcome for treatment respondents (TR), control respondents (CR), treatment attritters (TA), and control attritters (CA), respectively. Column 7 reports the p-value of the hypothesis test with two equality restrictions (\textit{selective attrition}",
             threeparttable = T) %>%
    save_kable(file ="output/sm_tab8.tex")



# Balance Tables ----------------------------------------------------------

# To preserve some code structure, I will rename some variables and datasets here.
all <- temp

# get quotas from the Brazilian Election Survey
eseb <- read_csv("data/eseb_quotas.csv")

# add percentage in the quotas
eseb = eseb %>% 
    mutate_at(vars(3:5), ~as.numeric(str_remove(.x, "%"))/100)

# Redefine age category
age = eseb %>%
    filter(category=="age") 
age = c(rep(1, 14), rep(2, 20), rep(3, 21), rep(4, 24), rep(5, 21))

# Load the haven package
library(haven)

# Import the SPSS file with Brazilian Electoral Daata
data <- read_sav("data/04810.sav")

#Import dataset with online panel of WhatsApp users
br <- read_csv("data/brazil_2022.csv") %>% slice(-1:-2) %>% clean_names()

# Filter on WhatsApp Users
br_wpp = br %>%
    mutate(wpp=str_detect(q_network_usage1, "Converso por WhatsApp")) %>%
    filter(wpp==TRUE)


# Build Balance Table -----------------------------------------------------

## age ----

# Experiment sample
deact_age <- all %>% 
    rename("q_age"="age") %>%
    mutate(q_age=case_when(q_age==1~"18  and 24",
                           q_age==2~"25 and 34",
                           q_age==3~"35 and 44",
                           q_age==4~"45 and 54",
                           q_age==5~"55 and 64",
                           q_age==6~"+  65"), 
           q_age=fct_relevel(q_age, c("18  and 24","25 and 34", 
                                      "35 and 44", 
                                      "45 and 54", 
                                      "55 and 64", 
                                      "+  65"))) %>%
    tabyl(q_age) %>%
    select(categories=q_age, percent) %>%
    slice(1:6)

# Brazilian Electoral Survey Age
pop_age <- data %>% 
    select(D01A_IDADE) %>%
    mutate(categories=case_when(D01A_IDADE<25 ~ 1, 
                                D01A_IDADE>24 & D01A_IDADE<35 ~ 2, 
                                D01A_IDADE>34 & D01A_IDADE<45 ~ 3, 
                                D01A_IDADE>44 & D01A_IDADE<55 ~ 4, 
                                D01A_IDADE>54 & D01A_IDADE<65 ~ 5,
                                D01A_IDADE>65 ~ 6)) %>%
    tabyl(categories)  %>% drop_na()

# Online Panel Age
br_edad <- br_wpp  %>%
    tabyl(sd_edad) %>%
    drop_na() %>%
    select(categories=sd_edad, percent)


# Bind all the information
age = bind_cols(deact_age,
                pop_age %>% select(pop_percent=percent), 
                br_edad %>% select(wpp_percent=percent))

## gender ----
# Experiment sample
deact_gender <- all %>% 
    rename("q_gender"="gender") %>%
    mutate(q_gender=ifelse(q_gender==1, "Female", "Male")) %>%
    tabyl(q_gender) %>%
    select(categories=q_gender, percent) 

# Brazilian Election Survey
pop_gender <- data %>% 
    select(D02) %>%
    mutate(categories=case_when(D02==1 ~ "Male",
                                TRUE ~ "Female")) %>%
    tabyl(categories) 

# Online Panel
br_gender <- br_wpp %>% 
    select(sd_genero) %>%
    mutate(categories=case_when(sd_genero=="Masculino" ~ "Male",
                                TRUE ~ "Female")) %>%
    tabyl(categories) 

# Bind all gender information
gender = bind_cols(deact_gender, 
                   pop_gender %>% select(pop_percent=percent), 
                   br_gender %>% select(wpp_percent=percent))


## education -------

# Experiment sample
deact_educ <- all %>% 
    rename("q_education"="education") %>%
    tabyl(q_education) %>% 
    mutate(categories=c("Less than High School", 
                        "Less than High School", 
                        "High School", 
                        "Professional Degree", 
                        "College Degree", 
                        "Postgraduate", NA)) %>%
    select(categories, percent) %>%
    group_by(categories) %>%
    summarise(percent=sum(percent)) %>%
    drop_na()

# Brazilian election survey - education
pop_educ <- data %>% 
    tabyl(D03) %>% 
    mutate(labels=names(attr(data$D03, "labels"))) %>%
    select(labels, percent) %>%
    mutate(categories=c(rep("Less than High School",6), 
                        "High School", 
                        "Professional Degree", 
                        "College Degree", 
                        "Postgraduate", NA)) %>%
    group_by(categories) %>%
    summarise(pop_percent=sum(percent))

# online panel - education
wpp_educ <- br_wpp %>%
    tabyl(sd_educacion)  %>%
    select(sd_educacion, percent) %>%
    mutate(categories=c("Postgraduate", "College Degree",
                        "Less than High School", "Less than High School", "High School", 
                        "Less than High School", "College Degree", "High School")) %>%
    group_by(categories) %>%
    summarise(wpp_percent=sum(percent))

educ = left_join(deact_educ, pop_educ) %>% left_join(wpp_educ)   %>%
       mutate(categories=fct_relevel(categories, c("Less than High School", 
                                     "High School", 
                                     "Professional Degree", 
                                     "College Degree", 
                                     "Postgraduate"))) %>%
        arrange(categories)

## Region ----

# Experiment Sample
library(sf)
library(geobr)

# states 
states <- read_state(year = 2020)  %>% st_transform(4326) # 2020 is just an example year

# convert survey information
all_geo <- all %>%
    select(location_latitude, location_longitude) %>%
    st_as_sf(coords = c("location_longitude", "location_latitude"), crs = 4326)

# join
all_geo <- st_join(all_geo, states)

# recode categories
all_geo <- all_geo %>% 
    st_drop_geometry() %>%
    tabyl(name_region) %>%
    select(categories=name_region, 
           percent=valid_percent) %>%
    drop_na() %>%
    mutate(categories=ifelse(categories=="Centro Oeste", "Centro-Oeste", categories))

# Brazilian Election survey - Region
pop_geo <- data %>%
    tabyl(REG) %>% 
    mutate(categories=names(attr(data$REG, "labels"))) %>%
    select(categories, pop_percent=percent)

# Online panel data - Region
wpp_geo <- br_wpp %>%
    select(location_latitude, location_longitude) %>%
    st_as_sf(coords = c("location_longitude", "location_latitude"), crs = 4326)

# join
wpp_geo <- st_join(wpp_geo, states)

# Recode Region
wpp_geo <- wpp_geo %>% 
    st_drop_geometry() %>%
    tabyl(name_region) %>%
    select(categories=name_region, 
           wpp_percent = valid_percent) %>%
    drop_na() %>%
    mutate(categories=ifelse(categories=="Centro Oeste", "Centro-Oeste", categories))


# join
geo = left_join(all_geo, pop_geo) %>% left_join(wpp_geo)

## voting first rounf----

# experiment sample
deact_vote <- all %>%
    tabyl(vote_first_recode) %>%
    mutate(categories=c("Lula", "Others", "Bolsonaro")) %>%
    select(categories, percent)

# Brazilian Electoral Vote
pop_vote <- data %>% 
    tabyl(Q10P1b) %>%
    filter(Q10P1b==6 | Q10P1b==4) %>%
    mutate(categories=c("Bolsonaro", "Lula")) %>%
    select(categories, pop_percent=percent) %>%
    add_row(categories="Others", pop_percent=0.29)

# This information was not available in the Online Panel

# Join
vote <- left_join(deact_vote, pop_vote)

## Interest in politics ----

## Experiment Sample
deact_politics <- all %>% 
    rename("q_politics"="interest_in_politics") %>%
    tabyl(q_politics) %>%
    mutate(categories=c("Not at All and Not Interested in Politics", 
                        "Not at All and Not Interested in Politics", 
                        "Interested and Very Interested in Politics", 
                        "Interested and Very Interested in Politics")) %>%
    group_by(categories) %>%
    summarise(percent=sum(percent))

# brazilian election survey
pop_politics <- data %>%
    tabyl(Q01) %>%
    mutate(categories=c("Not at All and Not Interested in Politics", 
                        "Not at All and Not Interested in Politics", 
                        "Interested and Very Interested in Politics", 
                        "Interested and Very Interested in Politics", NA, NA)) %>%
    group_by(categories) %>%
    summarise(pop_percent=sum(percent))

## this information is not available in the online panel

# Join
politics = left_join(deact_politics, pop_politics)

## Trust in electoral systems -----

# experiment sample
trust_inst = all %>%
    rename("q_legitimacy"="trust_elections") %>%
    tabyl(q_legitimacy) %>%
    slice(5) %>%
    mutate(categories="Trust Electoral System") %>%
    select(categories, percent)

# Brazilian Election Survey
trust_pop = data %>% 
    tabyl(Q13) %>%
    slice(1) %>%
    mutate(categories="Trust Electoral System") %>%
    select(categories, pop_percent=percent)

## this information is not available in the online panel


# Join
trust <- left_join(trust_inst, trust_pop)

## Combine all table ----
balance <- bind_rows(age, 
                     gender, educ, geo, vote, politics, trust) %>%
    as_tibble() %>%
    mutate_if(is.numeric, ~ round(.x, digits = 2))

# save table
library(kableExtra)
kable(
    balance, 
    caption = "Balance Table: Enrolled Sample and Brazilian Election Survey",
    format = "latex", booktabs = T, escape = F, linesep = "",
    row.names = F, label = "balance") %>%
    kable_styling(full_width = F, latex_options = c("HOLD_position", "scale_down")) %>%
    #  add_header_above(c("Variables" = 1, "Deactivation Sample" = 2, 
    #                     "Brazilian Election Survey" = 3, 
    #                     "WhatsApp Users" = 4))  %>%
    pack_rows("Age", 1, 6) %>%
    pack_rows("Gender", 7, 8) %>%
    pack_rows("Education", 9, 13) %>%  
    pack_rows("Region", 14, 18) %>%
    pack_rows("Vote First Round", 19, 21) %>%
    pack_rows("Interest In Politics", 22, 23) %>%
    column_spec(1, width = "5cm") %>%
    footnote(general = "Column 1 reports categories from demographics and political variable. Column 2 reports the baseline values for participants who sucessfully enrolled in the Deactivation Experiment. Column 3 reports banchmark values based the Brazilian Election Survey from 2022, which consists on in-person, face-to-face household survey, representative of the Brazilian population. Columns 4 reports demographics from a online based panel of WhatsApp Users") %>%
    save_kable(file ="output/sm_tab3.tex")


